%config InlineBackend.figure_formats = ["retina"]
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import scanpy as sc
from scipy import stats
from more_itertools import unique_everseen
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.n_jobs = 16
import anndata2ri
anndata2ri.activate()
%reload_ext rpy2.ipython
plt.rcParams.update({"font.family":"Reem Kufi"})
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
heatcolors = matplotlib.colors.LinearSegmentedColormap.from_list("", ["gray", "#000004FF", "#330A5FFF", "#781C6DFF",
"#BB3754FF", "#ED6925FF",
"#FCB519FF", "#FCFFA4FF"])
heatcolors_wr = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "#FFF5F0", "#FEE0D2", "#FCBBA1",
"#FC9272", "#FB6A4A", "#EF3B2C",
"#CB181D", "#A50F15", "#67000D"])
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
%%R
suppressWarnings(suppressMessages(require(scran)))
suppressWarnings(suppressMessages(require(scater)))
suppressWarnings(suppressMessages(library(DropletUtils)))
suppressWarnings(suppressMessages(library(batchelor)))
options(mc.cores = 24L)
%%R
# Load the individual data
## Guo Cairns data
SRR6860519 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860519/outs/filtered_feature_bc_matrix/")
colData(SRR6860519)$Library <- rep("SRR6860519", ncol(SRR6860519))
colData(SRR6860519)$Sample <- rep("Donor1", ncol(SRR6860519))
SRR6860520 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860520/outs/filtered_feature_bc_matrix/")
colData(SRR6860520)$Library <- rep("SRR6860520", ncol(SRR6860520))
colData(SRR6860520)$Sample <- rep("Donor1", ncol(SRR6860520))
SRR6860521 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860521/outs/filtered_feature_bc_matrix/")
colData(SRR6860521)$Library <- rep("SRR6860521", ncol(SRR6860521))
colData(SRR6860521)$Sample <- rep("Donor2", ncol(SRR6860521))
SRR6860522 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860522/outs/filtered_feature_bc_matrix/", )
colData(SRR6860522)$Library <- rep("SRR6860522", ncol(SRR6860522))
colData(SRR6860522)$Sample <- rep("Donor2", ncol(SRR6860522))
SRR6860523 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860523/outs/filtered_feature_bc_matrix/", )
colData(SRR6860523)$Library <- rep("SRR6860523", ncol(SRR6860523))
colData(SRR6860523)$Sample <- rep("Donor3", ncol(SRR6860523))
SRR6860524 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860524/outs/filtered_feature_bc_matrix/", )
colData(SRR6860524)$Library <- rep("SRR6860524", ncol(SRR6860524))
colData(SRR6860524)$Sample <- rep("Donor3", ncol(SRR6860524))
## Brian Hermann data
Adult3 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Hermann_McCarrey/AdultHuman_17-3/outs/filtered_feature_bc_matrix", )
colData(Adult3)$Library <- rep("Adult3", ncol(Adult3))
colData(Adult3)$Sample <- rep("Adult3", ncol(Adult3))
Adult4 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Hermann_McCarrey/AdultHuman_17-4/outs/filtered_feature_bc_matrix", )
colData(Adult4)$Library <- rep("Adult4", ncol(Adult4))
colData(Adult4)$Sample <- rep("Adult4", ncol(Adult4))
Adult5 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Hermann_McCarrey/AdultHuman_17-5/outs/filtered_feature_bc_matrix", )
colData(Adult5)$Library <- rep("Adult5", ncol(Adult5))
colData(Adult5)$Sample <- rep("Adult5", ncol(Adult5))
%%R -o hs_sce
# Merge all datasets
hs_sce <- cbind(SRR6860519, SRR6860520, SRR6860521, SRR6860522, SRR6860523, SRR6860524, Adult3, Adult4, Adult5)
rownames(hs_sce) <- rowData(hs_sce)$Symbol
print(hs_sce)
%%R
bcrank <- barcodeRanks(counts(hs_sce))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
MT.Genes <- grep("^MT-", rownames(hs_sce), value = T)
print(MT.Genes)
%%R
# Calculate QC metrics
hs_sce <- addPerCellQC(hs_sce, subsets = list(mt=MT.Genes))
#Remove low quality cells i.e., those with less than 200 genes expressed and less than 3500 UMI counts
hs_sce <- hs_sce[,hs_sce$detected > 200 & hs_sce$sum > 3500 & hs_sce$subsets_mt_percent < 20]
# Remove unxpressed genes
hs_sce <- hs_sce[Matrix::rowSums(counts(hs_sce)) > 0, ]
%%R
print(hs_sce)
%%R
# Normalization
set.seed(55063)
clusters <- quickCluster(hs_sce)
hs_sce <- computeSumFactors(hs_sce, cluster = clusters)
hs_sce <- logNormCounts(hs_sce)
set.seed(55063)
mod_mf <- modelGeneVar(hs_sce)
hvgs_mf <- getTopHVGs(mod_mf, prop = 0.1)
length(hvgs_mf)
%%R
# Dimensionality reduction
set.seed(55063)
hs_sce <- runPCA(hs_sce, subset_row = hvgs_mf)
hs_sce <- runTSNE(hs_sce, dimred = "PCA", num_threads = 24, verbose = F)
%%R
# Perform Batch correction
# Identify HVGs based on sample
set.seed(55063)
mod_hs_batch <- modelGeneVar(hs_sce, block = colData(hs_sce)$Library)
hvgs_hs_batch <- getTopHVGs(mod_hs_batch, prop = 0.1)
set.seed(55063)
hs_sce_corrected <- fastMNN(hs_sce, subset.row = hvgs_hs_batch, batch=hs_sce$Library)
set.seed(55063)
hs_sce_corrected <- runTSNE(hs_sce_corrected, dimred = "corrected", perplexity=300, num_threads = 12, verbose = F)
reducedDim(hs_sce, "mnn") <- reducedDim(hs_sce_corrected, "corrected")
reducedDim(hs_sce, "mnnTSNE") <- reducedDim(hs_sce_corrected, "TSNE")
%%R -o hs_sce
length(hvgs_hs_batch)
hs_sce.obs.index = hs_sce.obs['Barcode'].astype('str')+'-'+hs_sce.obs['Sample'].astype('str')
hs_sce.var_names_make_unique()
hs_sce.obs['Sample'].value_counts()
hs_sce.obs['sum'].describe()
hs_sce.obs['detected'].describe()
sc.pl.tsne(hs_sce, color = ["Sample"], cmap=heatcolors, size = 8, edgecolor = "black", linewidths = 0.1,
ncols = 2, legend_fontsize = 8, title="Before batch correction")
hs_sce.obsm['X_tsne'] = hs_sce.obsm['mnnTSNE']
sc.pl.tsne(hs_sce, color = ["Sample"], cmap=heatcolors,
size = 8,
edgecolor = "black",
linewidths = 0.1,
ncols = 2,
legend_fontsize = 8, title="After batch correction")
hs_sce.obsm["X_pca"] = hs_sce.obsm["mnn"]
sc.pp.neighbors(hs_sce, n_neighbors = 5)
sc.tl.leiden(hs_sce, resolution = 1.5, key_added = 'leiden_1.5')
sc.pl.tsne(hs_sce, color=["leiden_1.5"],
size=8,
legend_loc = "on data",
edgecolor = "black",
linewidths = 0.1,
legend_fontsize = 8,
alpha=0.2)
wv = sc.pl.stacked_violin(hs_sce, var_names=["DDX4", "VIM", "ID4", "REC8", "STRA8", "AURKA", "ACRV1",
"PRM2", "AMH", "CD74", "VWF", "ACTA2", "TAGLN", "CLU", "DCN",
"INSL3", "INHBA", "SOX9", "DLK1", "IGF1", "IGF2", "CFD", "SFRP1", "NR2F2"],
groupby="leiden_1.5", swap_axes=True, layer="logcounts")
hs_leiden_clusters = hs_sce.obs['leiden_1.5']
%%R -i hs_leiden_clusters
hs_sce$hs_leiden_clusters = hs_leiden_clusters
hs_sce$broadcluster <- as.character(hs_sce$hs_leiden_clusters)
%%R
hs_sce$broadcluster[hs_sce$broadcluster %in% c("14", "16", "0", "38", "12", "40", "41")] = "Somatic"
hs_sce$broadcluster[hs_sce$broadcluster %in% c("35","36", "37", "39", "28", "29", "33", "34")] = "Outliers"
hs_sce$broadcluster[!hs_sce$broadcluster %in% c("Outliers", "Somatic")] = "Germ"
%%R -o hs_sce_gs
# Seperate Germ and somatic cells
hs_sce_gs <- hs_sce[,hs_sce$broadcluster %in% c("Germ", "Somatic")]
hs_sce_gs <- hs_sce_gs[Matrix::rowSums(counts(hs_sce_gs)) > 0, ]
assay(hs_sce_gs, "normcounts") <- as(((2^logcounts(hs_sce_gs))-1), "dgCMatrix")
hs_sce_gs$hs_leiden_clusters <- as.character(hs_sce_gs$hs_leiden_clusters)
hs_sce_gs.obs.index = hs_sce_gs.obs['Barcode'].astype('str')+'-'+hs_sce_gs.obs['Sample'].astype('str')
hs_sce_gs.var_names_make_unique()
hs_sce_gs.obsm['X_tsne']=hs_sce_gs.obsm['mnnTSNE']
gsconditions = [
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['12', '40', '41'])),#Myoid
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['0'])),#Immature Leydig
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['38'])),#Sertoli
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['16'])),#Macrophage
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['14'])),#Endothelial
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['23', '13', '7', '19', '4'])),#Spermatogonia
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['8', '6', '18', '20', '31', '32', '17', '9', '26', '27', '21'])),#Spermatocytes
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['10', '15', '22', '24', '25'])),#Round spermatids
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['5', '1', '3', '2'])),#Elongating Spermatids
(hs_sce_gs.obs["hs_leiden_clusters"].isin(['11', '30']))]#Sperm
gsgroups = ["Myoid", "Immature Leydig", "Sertoli", "Macrophage", "Endothelial", "Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"]
gs_clusters = np.select(gsconditions, gsgroups, default="Germ")
hs_sce_gs.obs["broad_clusters"] = gs_clusters
hs_sce_gs.obs["broad_clusters"] = hs_sce_gs.obs["broad_clusters"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Sertoli", "Macrophage", "Endothelial",
"Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"],ordered=True)
hs_sce_gs.obs['hs_leiden_clusters_celltype'] = hs_sce_gs.obs['hs_leiden_clusters'].astype(str) +"-"+hs_sce_gs.obs['broad_clusters'].astype(str)
cols_clusters = ["#FFB292","gold", "#936C00","lime", "red", "#F16913", "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"]#"lightslategray",
sc.pl.tsne(hs_sce_gs, color=["broad_clusters"], legend_fontsize=10, palette=cols_clusters,
size=20, edgecolor="black", linewidths=0.1, title="Broad cell types")
sc.tl.dendrogram(hs_sce_gs, groupby="hs_leiden_clusters_celltype")
sc.pl.correlation_matrix(hs_sce_gs, groupby="hs_leiden_clusters_celltype", figsize=(8,6))
sc.tl.rank_genes_groups(hs_sce_gs, "broad_clusters", n_genes = 6000, method = "wilcoxon", layer = "logcounts", use_raw = False)
gslist = {}
for i in hs_sce_gs.obs["broad_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(hs_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by=["logfoldchanges"], ascending=[False]).dropna(axis=0)["names"].to_list()
gslist[i] = genes
for key, value in gslist.items():
#print value
print(key, len([item for item in value if item]))
gs_L1 = []
for i in hs_sce_gs.obs["broad_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(hs_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
gs_L1.extend(genes)
gs_unique_genes = list(unique_everseen(gs_L1))
len(gs_unique_genes)
mat = sc.pl.matrixplot(hs_sce_gs, gs_unique_genes, groupby="broad_clusters", figsize=(4, 4),standard_scale="var",
linewidth=.000001,swap_axes=True, cmap=heatcolors_wr, dendrogram=False, layer="logcounts",
show=False, show_gene_labels=False)
%%R -o hs_sce_germ
# Seperate Germ and somatic cells
## GERM
hs_sce_germ <- hs_sce[,hs_sce$broadcluster=="Germ"]
hs_sce_germ <- hs_sce_germ[Matrix::rowSums(counts(hs_sce_germ)) > 0, ]
hs_sce_germ$hs_leiden_clusters <- factor(hs_sce_germ$hs_leiden_clusters)
# Identify HVGs based on sample
set.seed(55063)
mod_hs_batch_germ <- modelGeneVar(hs_sce_germ, block=colData(hs_sce_germ)$Library)
hvgs_hs_batch_germ <- getTopHVGs(mod_hs_batch_germ, prop=0.1)
print(length(hvgs_hs_batch_germ))
set.seed(55063)
hs_sce_germ_corrected <- fastMNN(hs_sce_germ, subset.row=hvgs_hs_batch_germ, batch=hs_sce_germ$Library)
reducedDim(hs_sce_germ, "mnn") <- reducedDim(hs_sce_germ_corrected, "corrected")
hs_sce_germ$hs_leiden_clusters <- as.character(hs_sce_germ$hs_leiden_clusters)
hs_sce_germ.obs.index = hs_sce_germ.obs['Barcode'].astype('str')+'-'+hs_sce_germ.obs['Sample'].astype('str')
hs_sce_germ.var_names_make_unique()
hs_sce_germ.obsm['X_tsne'] = hs_sce_germ.obsm['mnnTSNE']
sc.pl.tsne(hs_sce_germ, color=["UTF1", "ID4","FGFR3", "SOHLH1", "UCHL1","DMRT1", "KIT", "STRA8", "REC8",
"PRDM9","SPO11","DMC1","MEIOB", "BRCA2","ATR","SYCP1","SMC1B", "SMC3","PIWIL1",
"PIWIL2", "RAD51", "SYCP2","SYCP3","HORMAD1","HORMAD2", "AURKA", "PLK1",
"NME5","SPACA1", "ACRV1", "AKAP14","TNP2","PRM2", "CRISP2", "TPPP2", "OAZ3"],
legend_fontsize=8, color_map=heatcolors, size=20, edgecolor="black", linewidths=0.1, wspace=0.1,
hspace=0.2, ncols=4, layer="logcounts")
hs_sce_germ.obsm["X_pca"] = hs_sce_germ.obsm["mnn"]
hs_sce_germ.obs["hs_leiden_clusters"].cat.reorder_categories(['23', '13', '7', '19', '4', '8', '6', '18', '20', '31',
'32', '17', '9', '26', '27', '21', '10', '15', '22', '24',
'25', '5', '1', '3', '2', '11', '30'], inplace = True)
germADconditions = [
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["23"])),#Undiff1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["13"])),#Undiff2
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["7"])),#E-diff
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["19"])),#Diff1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["4"])),#Diff2
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["8"])),#pre-Lep
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["6"])),#Lep1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["18"])),#Lep2
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["20"])),#Zyg
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["31"])),#P1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["32"])),#P2
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["17"])),#P3
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["9"])),#P4
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["26"])),#P5
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["27"])),#P6
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["21"])),#Dip & Sec. spermatocytes
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["10"])),#RS1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["15"])),#RS2
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["22"])),#RS3
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["24"])),#RS4
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["25"])),#RS5
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["5"])),#ES1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["1"])),#ES2
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["3"])),#ES3
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["2"])),#ES4
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["11"])),#Sperm1
(hs_sce_germ.obs["hs_leiden_clusters"].isin(["30"]))]#Sperm2
germADgroups = ["Undiff1", "Undiff2", "E-diff", "Diff1", "Diff2", "pre-Lep", "Lep1", "Lep2", "Zyg", "Pach1","Pach2","Pach3","Pach4","Pach5","Pach6", "Dip & Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","ES1","ES2","ES3","ES4","Sperm1", "Sperm2"]
hs_leiden_clusters = np.select(germADconditions, germADgroups, default="Germ")
hs_sce_germ.obs["germAD_clusters"] = hs_leiden_clusters
hs_sce_germ.obs["germAD_clusters"] = hs_sce_germ.obs["germAD_clusters"].astype("category").cat.reorder_categories(["Undiff1", "Undiff2", "E-diff", "Diff1", "Diff2", "pre-Lep", "Lep1", "Lep2", "Zyg", "Pach1","Pach2","Pach3","Pach4","Pach5","Pach6",
"Dip & Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","ES1","ES2","ES3","ES4","Sperm1", "Sperm2"],ordered=True)
cols_clusters = ["#FEEDDE", "#FDBE85", "#FD8D3C", "#E6550D", "#A63603",#Human
"#F7FCF5", "#DEE9DF", "#C5D7C9", "#ACC4B3", "#94B29D", "#7BA088", "#628D72", "#4A7B5C", "#316846", "#185630", "#00441B",
"#EFF3FF", "#BDD7E7", "#6BAED6", "#3182BD", "#08519C",
"#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
sc.pl.tsne(hs_sce_germ, color=["germAD_clusters"], size=10, palette=cols_clusters,
edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
hs_sce_gs.obs['Annotated'] = hs_sce_gs.obs['broad_clusters']
hs_sce_gs.obs['Annotated'] = hs_sce_gs.obs['Annotated'].astype('str')
for idx, x in hs_sce_germ.obs['germAD_clusters'].iteritems():
hs_sce_gs.obs.at[idx, 'Annotated'] = x
hs_sce_gs.obs["Annotated"] = hs_sce_gs.obs["Annotated"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Sertoli", "Macrophage", "Endothelial", "Undiff1", "Undiff2", "E-diff", "Diff1", "Diff2",
"pre-Lep", "Lep1", "Lep2", "Zyg", "Pach1","Pach2","Pach3","Pach4","Pach5","Pach6",
"Dip & Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","ES1","ES2","ES3","ES4","Sperm1", "Sperm2"],ordered=True)
palette = ["#FFB292","gold", "#936C00","lime", "red",
"#FEEDDE", "#FDBE85", "#FD8D3C", "#E6550D", "#A63603",#Human
"#F7FCF5", "#DEE9DF", "#C5D7C9", "#ACC4B3", "#94B29D", "#7BA088", "#628D72", "#4A7B5C", "#316846", "#185630", "#00441B",
"#EFF3FF", "#BDD7E7", "#6BAED6", "#3182BD", "#08519C",
"#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
sc.pl.tsne(hs_sce_gs, color=["Annotated"], size=10, palette=palette,
edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
hs_sce_germ.obs['Annotated'] = hs_sce_germ.obs['germAD_clusters']
sc.pp.neighbors(hs_sce_germ, n_neighbors=5) # PAGA - Partition-based graph abstraction
sc.tl.paga(hs_sce_germ, groups="germAD_clusters") # PAGA - Partition-based graph abstraction for germ cells
sc.pl.paga(hs_sce_germ, color="germAD_clusters", threshold=0.15, fontsize=8, fontoutline=1, frameon=False)
sc.tl.rank_genes_groups(hs_sce_germ, "germAD_clusters", n_genes=6000, method="wilcoxon", layer="logcounts", use_raw=False)
germlist = {}
for i in hs_sce_germ.obs["germAD_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(hs_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
germlist[i] = genes
germlist_L1 = []
for i in hs_sce_germ.obs["germAD_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(hs_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
germlist_L1.extend(genes)
# Obtain unique number of differentially expressed genes
germ_unique_genes = list(unique_everseen(germlist_L1))
len(germ_unique_genes)
ax = sc.pl.matrixplot(hs_sce_germ, germlist_L1, groupby="germAD_clusters", figsize=(10,6),standard_scale="var",
linewidth=.0000001, swap_axes=True, cmap=heatcolors_wr, dendrogram=False, layer="logcounts",
show_gene_labels=False)
hs_sce_germ.X = hs_sce_germ.layers["logcounts"]
# Load gene annnotation file
allgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Human/Human_genes_annotation_gtf.csv")
allgenes.index = allgenes['gene_name'].tolist()
allgenes = allgenes[['gene_id', 'seqname']]
allgenes.columns = ['GeneStableID', 'Chromosome']
allgenes = allgenes[allgenes['Chromosome'].isin(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20',
'21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y'])]
for ch in allgenes['Chromosome'].astype("category").cat.categories.tolist():
genes = allgenes["Chromosome"]==ch
Agenes = set(allgenes[genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
hs_sce_germ.obs[ch] = np.ravel((hs_sce_germ[:, list(Agenes)].X != 0).sum(axis=1))
AllChr_df = pd.DataFrame()
for column in ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
'2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y']:
x = hs_sce_germ.obs[column]
AllChr_df[column] = x
AllChr_df['germAD_clusters'] = hs_sce_germ.obs['germAD_clusters']
AllChr_df_melted = AllChr_df.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
AllChr_df_melted_mean = AllChr_df_melted.groupby(["germAD_clusters", "key"], as_index=False).mean()
#Reshape the data
AllChr_df_mean_reshaped = AllChr_df_melted_mean.pivot(index="key",columns="germAD_clusters")
AllChr_df_melted_std = AllChr_df_melted.groupby(["germAD_clusters", "key"]).std().reset_index()
AllChr_df_melted_std_reshaped = AllChr_df_melted_std.pivot(index="key",columns="germAD_clusters")
plt.rcParams["figure.figsize"]=6,3
cols = ["#87CEEB", "#83C8EA", "#7FC3E9", "#7BBEE9", "#78B8E8", "#74B3E8",
"#70AEE7", "#6DA8E7", "#69A3E6", "#659EE6", "#6298E5", "#5E93E5",
"#5A8EE4", "#5788E4", "#5383E3", "#4F7EE3", "#4C78E2", "#4873E2",
"#446EE1", "#4169E1", "#446EE1", "#4169E1", "darkorange"]
i=0
for c in ['1', '2','3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16',
'17', '18', '19', '20', '21', '22', 'Y']:
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(),
AllChr_df_mean_reshaped.loc[c], linewidth=0.75, color=cols[i], alpha=1, label=c)
i+=1
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(),
AllChr_df_mean_reshaped.iloc[21], linewidth=1.5, marker='', color= "#C70039", label="X")
# Add legend
plt.legend(loc="center", ncol=2, fontsize=6, bbox_to_anchor=(1.06, 0.58), frameon=False)
# Add titles
#plt.title("Mean no. of genes", loc='left', fontsize=12, fontweight=2, color='black')
plt.ylabel("Mean no. of genes", fontsize=8)
plt.xlabel("")
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor")
plt.tick_params(axis="both", which="minor", left=False)
plt.tick_params(axis="both", which="major", bottom=True, top=False, left=True, labelbottom=True, pad=0.5, length=1.5, labelsize=8)
plt.yscale('log', basey=10)
plt.tight_layout()
plt.margins(0.003,0.003)
plt.grid(linestyle='-', linewidth='0.1')
# Retrive chromosomes X, Y, 9 and autosomal genes for X to A ratio
chrX_genes = allgenes["Chromosome"]=="X"
chrY_genes = allgenes["Chromosome"]=="Y"
chr9_genes = allgenes["Chromosome"]=="9"
chrA_genes = allgenes[~allgenes.Chromosome.isin(["X","Y", "MT"])]
# Retrieve the X and Y genes specifically present in the germ cells alone
germ_Xgenes = set(allgenes[chrX_genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
germ_Ygenes = set(allgenes[chrY_genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
germ_9genes = set(allgenes[chr9_genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
germ_Agenes = set(chrA_genes.index.tolist()).intersection(hs_sce_germ.var_names.tolist())
hs_sce_germ.obs["Chr X"] = np.mean(
hs_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.mean(hs_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
hs_sce_germ.obs["Chr 9"] = np.mean(
hs_sce_germ[:, list(germ_9genes)].X, axis=1).A1 / np.mean(hs_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
germXA_data = hs_sce_germ.obs["germAD_clusters"].to_frame().join(hs_sce_germ.obs["Chr 9"]).join(hs_sce_germ.obs["Chr X"])#.join(hs_sce_germ.obs["Chr Y"])
#Melt the data to long shape
germXA_data_melted = germXA_data.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
from scipy import stats
stats.mannwhitneyu(germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Diff2'])) & (germXA_data_melted['key'].isin(['Chr X']))].value,
germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Zyg'])) & (germXA_data_melted['key'].isin(['Chr X']))].value)
from matplotlib.ticker import FormatStrFormatter
plt.rcParams["figure.figsize"]=6,3
bx = sns.violinplot(x="germAD_clusters", y="value",
hue="key", palette=["#0D8DB3","#C70039"],fliersize=0.1,linewidth=0.5,split=True,scale="count",inner="quartile",
data=germXA_data_melted)
bx.set(xlabel="")
bx.set_ylabel("Chromosome:Autosome ratio", fontsize=8)
bx.set_xticklabels(bx.get_xticklabels(), rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor", visible=True)
bx.grid(linestyle='-', linewidth='0.1')
bx.tick_params(
axis="both", # changes apply to the x-axis
which="major", # both major and minor ticks are affected
bottom=True, # ticks along the bottom edge are off
top=False, left=True, # ticks along the top edge are off
labelbottom=True,labeltop=False, pad=0.5, length=1.5)
bx.axhline(y=0.5,color='gray',linestyle='--')
bx.axhline(y=1,color='gray',linestyle='--')
plt.legend(loc="upper right", fontsize=6, frameon=False)
plt.tight_layout()
hs_sce_germ.obs["ChrX percent"] = (np.sum(
hs_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.sum(hs_sce_germ.X, axis=1).A1)*100
hs_sce_germ.obs["ChrY percent"] = (np.sum(
hs_sce_germ[:, list(germ_Ygenes)].X, axis=1).A1 / np.sum(hs_sce_germ.X, axis=1).A1)*100
#Make a dataframe using selected columns
germ_dist_data = hs_sce_germ.obs['germAD_clusters'].to_frame().join(hs_sce_germ.obs['detected']).join(hs_sce_germ.obs['sum']).join(hs_sce_germ.obs['subsets_mt_percent']).join(hs_sce_germ.obs['ChrX percent']).join(hs_sce_germ.obs['ChrY percent'])
germ_dist_data.columns = ['germAD_clusters','No. of genes', 'UMI Count', '% Mito', '% ChrX', '% ChrY']
#Melt the data to long shape
germ_dist_data_melted = germ_dist_data.melt(id_vars='germAD_clusters', var_name='key', value_name='value')
g = sns.FacetGrid(germ_dist_data_melted, col="key", height=6, sharex=False, hue="germAD_clusters", aspect=.5, palette=cols_clusters)
g.map(sns.violinplot, "value", "germAD_clusters", label='xxlarge', linewidth=0.1).set_titles("{col_name}").set_axis_labels("","")
sc.pp.neighbors(hs_sce_germ, n_pcs=50, knn=False, method="gauss")# Find the neighbors
sc.tl.diffmap(hs_sce_germ)
hs_sce_germ.uns["iroot"] = np.flatnonzero(hs_sce_germ.obs["germAD_clusters"] == "Undiff1")[0] # Set the root cell
sc.tl.dpt(hs_sce_germ)
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
sc.pl.tsne(hs_sce_germ, color=["dpt_pseudotime"], legend_fontsize=8, cmap= "viridis",
size=20, edgecolor="black", linewidths=0.05) #Use gauss method to find neighbors to get proper pseudotime
# Load chr X gene annnotation file
Xgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Human/Human_Xgenes_Annotated_updated.csv")
Xgenes.index = Xgenes['gene_name'].tolist()
Xgenes = Xgenes[~Xgenes.index.duplicated(keep='last')]
Xgenes['GeneClass'].value_counts()
Xgenes[['Ancestral', 'Shared']] = Xgenes[['Ancestral','Shared']].fillna(value="NotShared")
# Subset data based on different classes of X chromosome genes
germXsingle_sce = hs_sce_germ[:,hs_sce_germ.var['Symbol'][hs_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Single-copy"])].tolist()]
germXsingle_sce.var_names_make_unique()
germXsingle_sce.obs_names_make_unique()
germXMulti_sce = hs_sce_germ[:,hs_sce_germ.var['Symbol'][hs_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Multicopy"])].tolist()]
germXMulti_sce.var_names_make_unique()
germXMulti_sce.obs_names_make_unique()
germXAmpliconic_sce = hs_sce_germ[:,hs_sce_germ.var['Symbol'][hs_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Ampliconic"])].tolist()]
germXAmpliconic_sce.var_names_make_unique()
germXAmpliconic_sce.obs_names_make_unique()
germY_sce = hs_sce_germ[:,list(germ_Ygenes)]
germY_sce.var_names_make_unique()
germY_sce.obs_names_make_unique()
sc.tl.rank_genes_groups(germXsingle_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXMulti_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXAmpliconic_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germY_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
singleX_L1 = []
for i in germXsingle_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXsingle_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
singleX_L1.extend(genes)
MultiX_L1 = []
for i in germXMulti_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXMulti_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
MultiX_L1.extend(genes)
Ampli_L1 = []
for i in germXAmpliconic_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXAmpliconic_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
Ampli_L1.extend(genes)
singleY_L1 = []
for i in germY_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germY_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
singleY_L1.extend(genes)
singleX_unique = pd.Series(singleX_L1).drop_duplicates().tolist()
MultiX_unique = pd.Series(MultiX_L1).drop_duplicates().tolist()
Ampli_unique = pd.Series(Ampli_L1).drop_duplicates().tolist()
singleY_unique = pd.Series(singleY_L1).drop_duplicates().tolist()
print("No. of single X copy genes: %d" %len(singleX_unique))
print("No. of multi X copy genes: %d" %len(MultiX_unique))
print("No. of ampliconic X genes: %d" %len(Ampli_unique))
print("No. of Y genes: %d" %len(singleY_unique))
XYgenes = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique, 'Y':singleY_unique}
X_sma = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique}
ax = sc.pl.matrixplot(hs_sce_germ, X_sma, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.001,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
ax = sc.pl.matrixplot(hs_sce_germ, {'Multi':MultiX_unique, 'Ampli':Ampli_unique}, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
Xgenes_hs = Xgenes[Xgenes['gene_name'].isin(hs_sce_germ.var['Symbol'])]
Xgenes_hs['GeneClass'].value_counts()
pd.crosstab(Xgenes_hs['GeneClass'], Xgenes_hs['Shared'])
MultiX_unique_shared = {}
for i in Xgenes_hs['Shared'].astype('category').cat.categories:
genes = [x for x in MultiX_unique if x in Xgenes_hs['gene_name'][(Xgenes_hs['GeneClass']=='Multicopy') & (Xgenes_hs['Shared']==i)].tolist()]
MultiX_unique_shared[i] = genes
ax = sc.pl.matrixplot(hs_sce_germ, MultiX_unique_shared, groupby="Annotated", figsize=(10,7),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
AmpliX_unique_shared = {}
for i in Xgenes_hs['Shared'].astype('category').cat.categories:
genes = [x for x in Ampli_unique if x in Xgenes_hs['gene_name'][(Xgenes_hs['GeneClass']=='Ampliconic') & (Xgenes_hs['Shared']==i)].tolist()]
AmpliX_unique_shared[i] = genes
ax = sc.pl.matrixplot(hs_sce_germ, AmpliX_unique_shared, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)
sc.pl.dotplot(hs_sce_germ,singleY_unique, groupby="Annotated", figsize=(10,6), standard_scale="var",color_map=heatcolors_wr,
dendrogram=False, layer="logcounts")
hs_unique = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/17Apr_rev/Hs_unique.txt")
Orthologs = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/Human_Mouse_Monkey_orthologs_21Feb_clean.txt", sep=" ")
Orthologs = Orthologs[(Orthologs.Crab_eating_macaque_homology_type=="ortholog_one2one") & (Orthologs.Mouse_homology_type=="ortholog_one2one")]
hs_ortho_df = pd.DataFrame(columns=["1:1:1 Othologs", "Human-specific", "Other orthologs"])
for i in hs_sce_germ.obs['germAD_clusters'].astype('category').cat.categories.tolist():
df = i+"_df"
df = hs_sce_germ[hs_sce_germ.obs['germAD_clusters'].isin([i]),:];
sc.pp.filter_genes(df, min_cells=round((df.n_obs*0.05)));
hs_ortho_df.loc[i] = [len(df.var['ID'][df.var['ID'].isin(Orthologs.Gene_stable_ID.tolist())])/len(df.var['ID'])*100,
len(df.var['ID'][df.var['ID'].isin(hs_unique['x'].tolist())])/len(df.var['ID'])*100,
len(df.var['ID'][~df.var['ID'].isin(Orthologs.Gene_stable_ID.tolist()+hs_unique['x'].tolist())])/len(df.var['ID'])*100]
hs_ortho_df['Groups'] = hs_ortho_df.index
plt.rcParams["figure.figsize"]=3,10
hs_ortho_df_melted = hs_ortho_df.melt('Groups', var_name='Type', value_name='value')
g = sns.factorplot(x="Groups", y="value", hue='Type', data=hs_ortho_df_melted, kind="point",
palette = ["#DF8F44FF", "#AD4A38", "#80796BFF"], height=5, aspect=1.3, scale=0.6)
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor");
plt.xlabel("");
plt.ylabel("% Genes expressed", fontsize=10);